##### ####################################################
#####                                               ######
#####                 Influence analysis            
#####                                               ######
##### ####################################################

# init ------------------------------------------------------------

rm(list=ls())
set.seed(221186)

# Load libraries

library(data.table) # 1.11.4
library(mgcv) # 1.8-24
library(zoo) # 1.8.4
library(stargazer) # 5.2.2
source("utils.R")

# Load data

load("working/speeches_influence.Rdata")
load("working/speech_refs.Rdata")

# Remove debates where no minister is present
speeches <- speeches[speeches$minister_present == T]

## Recode yearmon to numeric
speeches$yearmon_numeric <- as.numeric(speeches$yearmon)-1997

## Remove observations for which there is no dependent variable

missing.obs <- unique(c(which(is.na(speeches$eigen)), which(is.na(speeches$page_rank))))
speeches <- speeches[-missing.obs]

## Redefine a couple of variables

speeches[,hpos.zero.one:=(hpos.new-min(hpos.new))/(max(hpos.new)-min(hpos.new)),by=section_id]

speeches[,degree := degree/.N,by = subsection_id]

out <- speeches[, list(auth = sum(auth,na.rm=T), 
                       page_rank = sum(page_rank,na.rm=T), 
                       eigen = sum(eigen,na.rm=T), 
                       degree = sum(degree,na.rm=T), 
                       hpos.zero.one = mean(hpos.zero.one,na.rm = T),
                       minister_in_debate = unique(minister_in_debate),
                       speaker = unique(is_speaker),
                       Gender = unique(Gender),
                       Party = unique(party_short),
                       minister_gender = unique(minister_gender),
                       debate_department = unique(debate_department),
                       yearmon = unique(yearmon),
                       yearmon_numeric = unique(yearmon_numeric)),by = list(subsection_id, twfy_person_id)]

out$ministry <- as.factor(out$debate_department)

##########
## Individual level models 
##########

## Baseline models for f-test

baseline_formulas <- list("~ Party",
                 "~ Party + debate_department",
                 "~ Party + as.factor(yearmon)",
                 "~ Party + debate_department + as.factor(yearmon)",
                 "~ Party + debate_department*yearmon_numeric + as.factor(yearmon)",
                 "~ Party + debate_department*poly(yearmon_numeric,2) + as.factor(yearmon)")

baseline_out <- collect_models_function(data = out[minister_in_debate==F & speaker == F], 
                                         dv = "page_rank", 
                                         bootstrap.reps=10, 
                                         run.boot=F, 
                                         formulas = baseline_formulas)

## Interaction specification

formulas <- list("~ minister_gender*Gender + Party",
                 "~ minister_gender*Gender + Party + debate_department",
                 "~ minister_gender*Gender + Party + as.factor(yearmon)",
                 "~ minister_gender*Gender + Party + debate_department + as.factor(yearmon)",
                 "~ minister_gender*Gender + Party + debate_department*yearmon_numeric + as.factor(yearmon)",
                 "~ minister_gender*Gender + Party + debate_department*poly(yearmon_numeric,2) + as.factor(yearmon)")

formula.mod.gam <- as.formula("page_rank ~ minister_gender*Gender + s(yearmon_numeric, by = ministry) + ministry + as.factor(yearmon)")
formula.mod.gam.auth <- as.formula("auth ~ minister_gender*Gender + s(yearmon_numeric, by = ministry) + ministry + as.factor(yearmon)")

# Pagerank models

page_rank_out <- collect_models_function(data = out[minister_in_debate==F & speaker == F], 
                                         dv = "page_rank", 
                                         bootstrap.reps=10, 
                                         run.boot=F, 
                                         formulas = formulas)

page_rank_out$mod.7 <- gam(formula.mod.gam, data = out[minister_in_debate==F & speaker == F])

f_stats <- c(anova(baseline_out$mod.1, page_rank_out$mod.1)$F[2],
anova(baseline_out$mod.2, page_rank_out$mod.2)$F[2],
anova(baseline_out$mod.3, page_rank_out$mod.3)$F[2],
anova(baseline_out$mod.4, page_rank_out$mod.4)$F[2],
anova(baseline_out$mod.5, page_rank_out$mod.5)$F[2],
anova(baseline_out$mod.6, page_rank_out$mod.6)$F[2])

# Auth models

auth_out <- collect_models_function(data = out[minister_in_debate==F & speaker == F], 
                                    dv = "auth", 
                                    bootstrap.reps=10, 
                                    run.boot=F, 
                                    formulas = formulas)

auth_out$mod.7 <- gam(formula.mod.gam.auth, data=out[minister_in_debate==F & speaker == F])

## Split-sample specification

formulas_split <- list("~ minister_gender + Party",
                 "~ minister_gender + Party + debate_department",
                 "~ minister_gender + Party + as.factor(yearmon)",
                 "~ minister_gender + Party + debate_department + as.factor(yearmon)",
                 "~ minister_gender + Party + debate_department*yearmon_numeric + as.factor(yearmon)",
                 "~ minister_gender + Party + debate_department*poly(yearmon_numeric,2) + as.factor(yearmon)")

formula_gam_split <- as.formula("page_rank ~ minister_gender + s(yearmon_numeric, by = ministry) + ministry + as.factor(yearmon)")

out$ministry <- as.factor(out$debate_department)

# Models for women

page_rank_out_split_female <- collect_models_function(data = out[minister_in_debate==F & speaker == F & Gender == "F"], 
                                                      dv = "page_rank", bootstrap.reps=10, 
                                                      run.boot=F, 
                                                      formulas = formulas_split)

page_rank_out_split_female$mod.7 <- gam(formula_gam_split, data=out[minister_in_debate==F & speaker == F & Gender == "F"])

# Models for men

page_rank_out_split_male <- collect_models_function(data = out[minister_in_debate==F & speaker == F & Gender == "M"], 
                                                    dv = "page_rank", bootstrap.reps=10, 
                                                    run.boot=F, 
                                                    formulas = formulas_split)

page_rank_out_split_male$mod.7 <- gam(formula_gam_split, data=out[minister_in_debate==F & speaker == F & Gender == "M"])

## Tables

mod_stargazer <- function(...){
  output <- capture.output(stargazer(...))
  # The first three lines are the ones we want to remove...
  output <- output[grep("begin\\{tabular\\}",output):(grep("textit\\{Note:\\}",output)-1)]
  # cat out the results - this is essentially just what stargazer does too
  cat(paste(output, collapse = "\n"), "\n")
}

stargazer.mult.function <- function(model.object, title = "Eigenvector centrality", se = "boot", vars.to.keep = c("^minister_genderF$","^GenderF$","^minister_genderF:GenderF$","Constant"), covar.labs = c("Constant","Female minister", "Female MP", "Interaction"), include_fstats = T){
  
  party.fe <- c("Party FEs","$\\checkmark$","\\checkmark","$\\checkmark$","\\checkmark","\\checkmark","\\checkmark","\\checkmark")  
  ministry.fe <- c("Ministry FEs","$\\times$","\\checkmark","$\\times$","\\checkmark","\\checkmark","\\checkmark","\\checkmark")
  month.fe <- c("Month FEs","$\\times$","$\\times$","\\checkmark","\\checkmark","\\checkmark","\\checkmark","\\checkmark")
  time.trend.linear <- c("Linear time trends","$\\times$","$\\times$","$\\times$", "$\\times$","\\checkmark","\\checkmark","$\\times$")
  time.trend.quadratic <- c("Quadratic time trends","$\\times$","$\\times$","$\\times$", "$\\times$","$\\times$","\\checkmark","$\\times$")
  time.trend.gam <- c("Flexible time trends","$\\times$","$\\times$","$\\times$", "$\\times$","$\\times$","$\\times$","\\checkmark")		
  if(se == "regular") se.list <- list(model.object$mod.1$se, model.object$mod.2$se, model.object$mod.3$se, model.object$mod.4$se, model.object$mod.5$se)
  if(se == "clustered") se.list <- list(model.object$mod.1$clustered.se, model.object$mod.2$clustered.se, model.object$mod.3$clustered.se, model.object$mod.4$clustered.se, model.object$mod.5$clustered.se, model.object$mod.6$clustered.se, summary(model.object$mod.7)$se)
  if(se == "boot") se.list <- list(model.object$mod.1$boot.se, model.object$mod.2$boot.se, model.object$mod.3$boot.se, model.object$mod.4$boot.se, model.object$mod.5$boot.se)	
  
  
    lines.to.add <- list(party.fe, ministry.fe, month.fe, time.trend.linear, time.trend.quadratic, time.trend.gam)
  
  
  mod_stargazer(
    model.object$mod.1, model.object$mod.2, model.object$mod.3, model.object$mod.4, model.object$mod.5, model.object$mod.6, model.object$mod.7,
    intercept.top=T, intercept.bottom=F, 
    keep= vars.to.keep,
    add.lines = lines.to.add,
    covariate.labels= covar.labs,
    column.sep.width="0.25pt", keep.stat=c("n","rsq","adj.rsq"),   
    dep.var.labels=c(paste0("\\emph{",title,"}")), dep.var.caption="",
    no.space=T, se = se.list, model.names=F)	
}

sink(file="latex/tables/influence_female_pagerank.tex")
stargazer.mult.function(page_rank_out_split_female, title = "Influence (female MPs)", se = "clustered", vars.to.keep = "^minister_genderF$", covar.labs = "Female minister")
sink()

sink(file="latex/tables/influence_male_pagerank.tex")
stargazer.mult.function(page_rank_out_split_male, title = "Influence (male MPs)", se = "clustered", vars.to.keep = "^minister_genderF$", covar.labs = "Female minister")
sink()

sink(file="latex/tables/influence_pagerank_interaction.tex")
stargazer.mult.function(page_rank_out, title = "Influence", se = "clustered")
sink()

sink(file="latex/tables/influence_auth_interaction.tex")
stargazer.mult.function(auth_out, title = "Influence", se = "clustered")
sink()

## ####################################
## Plot effect sizes
## ####################################

sum.coefficients.ci <- function(model, mod.vcov=NULL, var2 = "^minister_genderF$", var1 = "^GenderF", int = "^minister_genderF:GenderF$", baseline.var2, baseline.int){
  
  if(is.null(mod.vcov)) mod.vcov <- vcov(model)
  
  mod.coef <- na.omit(coef(model))
  colnames(mod.vcov) <- names(mod.coef)
  rownames(mod.vcov) <- names(mod.coef)
  
  ## Find the variables in the covariance matrix
  var1.position <- grep(var1, colnames(mod.vcov))
  var2.position <- grep(var2, colnames(mod.vcov))
  interaction.position <- grep(int, colnames(mod.vcov))
  
  # Find the variances
  var1.var <- diag(mod.vcov)[var1.position]
  var2.var <- diag(mod.vcov)[var2.position]
  interaction.var <- diag(mod.vcov)[interaction.position]
  
  # Find the covariance
  var2var3.covariance <- mod.vcov[var2.position,interaction.position]
  
  # Point esitmates for b2 and b3
  var1.pe <- mod.coef[var1.position]
  var2.pe <- mod.coef[var2.position]
  interaction.pe <- mod.coef[interaction.position]	
  
  # Sum the point estimates for b2 and b3
  pe.for.sum <- mod.coef[var2.position] + mod.coef[interaction.position]
  
  # Find the standard error
  se.for.sum <- sqrt(var2.var + interaction.var + 2*(var2var3.covariance))
  
  # Find the confidence intervals
  var1.upper <- var1.pe + 1.96 * sqrt(var1.var)
  var1.lower <- var1.pe - 1.96 * sqrt(var1.var)
  var2.upper <- var2.pe + 1.96 * sqrt(var2.var)
  var2.lower <- var2.pe - 1.96 * sqrt(var2.var)
  interaction.upper <- interaction.pe + 1.96 * sqrt(interaction.var)
  interaction.lower <- interaction.pe - 1.96 * sqrt(interaction.var)
  sum.upper <- pe.for.sum + 1.96 * se.for.sum
  sum.lower <- pe.for.sum - 1.96 * se.for.sum
  
  # Is the interaction significant?
  sig.test  <- coeftest(model, vcov = mod.vcov)
  sig.test <- sig.test[interaction.position,4]<0.05
  
  effect.size.var2 <- var2.pe/baseline.var2
  effect.size.var2.lower <- var2.lower/baseline.var2
  effect.size.var2.upper <- var2.upper/baseline.var2
  
  effect.size.int <- pe.for.sum/baseline.int
  effect.size.int.lower <- sum.lower/baseline.int
  effect.size.int.upper <- sum.upper/baseline.int
  
  # Bind it all together
  out.coef <- data.frame(var1.pe, var1.se= sqrt(var1.var), var1.lower, var1.upper,var2.pe, var2.se = sqrt(var2.var), var2.lower, var2.upper, pe.for.sum, se.for.sum, sum.lower, sum.upper, interaction.pe, interaction.se = sqrt(interaction.var), interaction.upper, interaction.lower, effect.size.var2 , effect.size.var2.lower, effect.size.var2.upper, effect.size.int , effect.size.int.lower, effect.size.int.upper, sig.test)
  return(out.coef)
}

plot.effects.function <- function(mod_list, baseline.male, baseline.female, plot_suffix = "auth"){
 
  model1est <- sum.coefficients.ci(model = mod_list$mod.1, mod.vcov = mod_list$mod.1$clus.vcov, baseline.var2 = baseline.male, baseline.int = baseline.female)
  model2est <- sum.coefficients.ci(model = mod_list$mod.2, mod.vcov = mod_list$mod.2$clus.vcov, baseline.var2 = baseline.male, baseline.int = baseline.female)
  model3est <- sum.coefficients.ci(model = mod_list$mod.3, mod.vcov = mod_list$mod.3$clus.vcov, baseline.var2 = baseline.male, baseline.int = baseline.female)
  model4est <- sum.coefficients.ci(model = mod_list$mod.4, mod.vcov = mod_list$mod.4$clus.vcov, baseline.var2 = baseline.male, baseline.int = baseline.female)
  model5est <- sum.coefficients.ci(model = mod_list$mod.5, mod.vcov = mod_list$mod.5$clus.vcov, baseline.var2 = baseline.male, baseline.int = baseline.female)
  model6est <- sum.coefficients.ci(model = mod_list$mod.6, mod.vcov = mod_list$mod.6$clus.vcov, baseline.var2 = baseline.male, baseline.int = baseline.female)
  modelGAMest <- sum.coefficients.ci(model = mod_list$mod.7, baseline.var2 = baseline.male, baseline.int = baseline.female)
  
  female.ys <- rev(seq(0.5,3.5,0.5))
  male.ys <- rev(seq(4,7,0.5))
  
  male.col <- "gray"
  female.col <- "black"
  
  background.col <- "#F3F4E8"
  
  for.plotting <- rbind(model1est, model2est, model3est, model4est, model5est, model6est, modelGAMest)
  xlims <- range(c(for.plotting[,c("var1.lower","var1.upper","sum.upper","sum.lower")],-0.001))
  
  for.plotting <- for.plotting * 100
  xlims <- range(c(for.plotting[,c("effect.size.var2.lower","effect.size.var2.upper","effect.size.int.upper","effect.size.int.lower")]))
  lwd.choice <- 4
  png(paste0("plots/influence_effect_size_",plot_suffix,".png"),1200,550)
  par(mar=c(4,4,0,11), xpd=F, cex=2, bg="transparent")
  plot(for.plotting$effect.size.int, female.ys, xlim=xlims, ylim = c(0, 7.5), col= female.col, pch=19, bty="n", xlab="Effect size (%) relative to male minister baseline", yaxt="n", ylab="")
  segments(x0 = for.plotting$effect.size.int.lower, x1 = for.plotting$effect.size.int.upper, y0= female.ys, col= female.col, lty=c(1:6,3), lwd=lwd.choice)
  points(for.plotting$effect.size.var2, male.ys, col= male.col, pch=19)
  segments(x0 = for.plotting$effect.size.var2.lower, x1 = for.plotting$effect.size.var2.upper, y0= male.ys, col= male.col, lty=c(1:6,3), lwd=lwd.choice)
  abline(v=0, lty=3, lwd=lwd.choice)
  par(xpd=T)
  legend(x=max(xlims),y=6.5, legend=c("Male", "Female"), col=c(male.col, female.col), lty=1, bty="n", cex=1, lwd=lwd.choice)
  legend(x=max(xlims),y=4, legend = c("Base model",expression(+ lambda[m0]),expression(+delta[t]), expression(+lambda["m0"]+delta[t]), expression(+lambda[m0] + delta[t] +lambda[m1]*t),expression(+lambda[m0] + delta[t] +lambda[m1]*t + lambda[m1]*t^2),"GAM"), lty=c(1:6,3), bty="n", cex=1, lwd=lwd.choice)
  dev.off()
  
}

# Authority

baseline.male.auth <- speeches[minister_in_debate==F & minister_gender=="M" & Gender=="M",mean(auth,na.rm=T)]
baseline.female.auth <- speeches[minister_in_debate==F & minister_gender=="M" & Gender=="F",mean(auth,na.rm=T)]

plot.effects.function(auth_out,  baseline.male.auth, baseline.female.auth, plot_suffix = "auth")

# Page rank

baseline.male.page.rank <- out[minister_in_debate==F & minister_gender=="M" & Gender=="M",mean(page_rank,na.rm=T)]
baseline.female.page.rank <- out[minister_in_debate==F & minister_gender=="M" & Gender=="F",mean(page_rank,na.rm=T)]

plot.effects.function(page_rank_out, baseline.male.page.rank, baseline.female.page.rank, plot_suffix = "page_rank")

## Save the output of model 6 to use in the write up

model6est <- sum.coefficients.ci(model = page_rank_out$mod.6, mod.vcov = page_rank_out$mod.6$clus.vcov, baseline.var2 = baseline.male.page.rank, baseline.int = baseline.female.page.rank)
model6est <- model6est*100

sink("latex/tables/influence_effect_size.tex")
cat(paste0(round(model6est$effect.size.int),"\\% [",round(model6est$effect.size.int.lower),"\\%, ", round(model6est$effect.size.int.upper),"\\%]"))
sink()

##########
## Debate-level model (page rank only)
##########

formulas <- list("~ minister_gender*Gender",
                 "~ minister_gender*Gender + debate_department",
                 "~ minister_gender*Gender + as.factor(yearmon)",
                 "~ minister_gender*Gender + debate_department + as.factor(yearmon)",
                 "~ minister_gender*Gender + debate_department*yearmon_numeric + as.factor(yearmon)",
                 "~ minister_gender*Gender + debate_department*poly(yearmon_numeric,2) + as.factor(yearmon)")

out_debate <- out[minister_in_debate==F & speaker == F,
                  list(page_rank = mean(page_rank), 
                       auth = mean(auth), 
                       n = .N, 
                       minister_gender = unique(minister_gender), 
                       debate_department = unique(debate_department),
                       yearmon = unique(yearmon),
                       yearmon_numeric = unique(yearmon_numeric)
                       ),
                  by = list(subsection_id,Gender)]

# Lm
page_rank_out_debate <- collect_models_function(formulas = formulas, 
                                                dv = "page_rank", 
                                                data = out_debate, 
                                                weights = out_debate$n, 
                                                run.boot = T, 
                                                bootstrap.reps = 500)

# Gam
out_debate$ministry <- as.factor(out_debate$debate_department)

formula.mod.gam <- as.formula("page_rank ~ minister_gender*Gender + s(yearmon_numeric, by = ministry) + ministry + as.factor(yearmon)")
page_rank_out_debate$mod.7 <- gam(formula.mod.gam, data = out_debate, weights = out_debate$n)

sink(file="latex/tables/centrality_debate_pagerank_tables.tex")
stargazer.mult.function(page_rank_out_debate, title = "Influence", se = "boot")
sink()


##########
## Are ministers more influential than backbenchers? Is the speaker less influential?
##########

valid.mod1 <- lm(auth ~ minister_in_debate, data = out)
valid.mod2 <- lm(auth ~ speaker, data = out)
valid.mod3 <- lm(auth ~ minister_in_debate + speaker, data = out)

baseline_backbench <- mean(out[minister_in_debate==F & speaker==F]$auth)

coef(valid.mod3)[2]/coef(valid.mod3)[1]
coef(valid.mod3)[3]/coef(valid.mod3)[1]
vars.to.keep = c("^minister_in_debateTRUE$","^speakerTRUE$","Constant")

sink("latex/tables/influence_validity_checks.tex")
mod_stargazer(
   valid.mod1, valid.mod2, valid.mod3,
   keep = vars.to.keep,
   intercept.top=F, intercept.bottom=T,
   covariate.labels = c("Minister", "Speaker", "Constant"),
   column.sep.width="1pt", keep.stat=c("n","rsq"),
   dep.var.caption="",
  no.space=T,
   dep.var.labels = "\\emph{Influence}"
 )
 sink()

##########
## Is influence just speech length? Is it just debate position?
##########

correlation <- speeches[,list(date = unique(hdate),
                  cor.words = (cor(page_rank, word_count)),
                  cor.pos = (cor(page_rank, hpos.zero.one))),by =  subsection_id]

correlation$year <- substring(as.character(correlation$date),1,4)

png("plots/diagnostics/influence_correlation.png",1200,500)
par(mfrow=c(1,2),bg="transparent", cex=1.7, mar=c(3,4,3,2)+0.1)
boxplot(correlation$cor.words ~ correlation$year, main="Speech length & influence", xlab="Year", ylab="Correlation", las=2, cex.lab=1.2, pch=19, outcex=.5, outcol="gray")
abline(h=0, lty=3, lwd=2)
boxplot(correlation$cor.pos ~ correlation$year, main="Debate position & influence", xlab="Year", ylab="Correlation", las=2, cex.lab=1.2, pch=19, outcex=.5, outcol="gray")
abline(h=0, lty=3, lwd=2)
dev.off()


##########
## Does influence correlate with direct mentions?
##########

speech_refs$yearmon_numeric <- as.numeric(speech_refs$yearmon)
speech_refs$ministry <- as.factor(speech_refs$debate_department)

out <- speeches[, list(auth = sum(auth,na.rm=T), 
                       page_rank = sum(page_rank,na.rm=T), 
                       eigen = sum(eigen,na.rm=T), 
                       degree = sum(degree,na.rm=T), 
                       hpos.zero.one = mean(hpos.zero.one,na.rm = T),
                       minister_in_debate = unique(minister_in_debate),
                       speaker = unique(is_speaker),
                       Gender = unique(Gender),
                       minister_gender = unique(minister_gender),
                       debate_department = unique(debate_department),
                       yearmon = unique(yearmon),
                       yearmon_numeric = unique(yearmon_numeric)),by = list(subsection_id,twfy_person_id)]

out$year <- format(out$yearmon, "%Y")

direct_mentions <- merge(out, speech_refs[,c("subsection_id","twfy_person_id","reference_degree")], by = c("subsection_id","twfy_person_id"))

direct_mentions <- direct_mentions[,list(cor.references = cor(page_rank, reference_degree),
                      year = unique(year)),by = subsection_id]

png("plots/diagnostics/reference_correlation.png",800,800)
par(mfrow=c(1,1),bg="transparent", cex=1.7, mar=c(3,4,3,2)+0.1)
boxplot(direct_mentions$cor.references ~ direct_mentions$year, main="Influence and direct mentions", xlab="Year", ylab="Correlation", las=2, cex.lab=1.2, pch=19, outcex=.5, outcol="gray")
abline(h=0, lty=3, lwd=2)
dev.off()
